#include <algorithm>
#include <cassert>
#include <cstdio>
#include <functional>
#include <memory>
#include <stdexcept>
#include <vector>

#include "gpt.h"

#include "kernel/activation_types.h"
#include "kernel/addbias.h"
#include "kernel/embedding.h"
#include "kernel/findmax.h"
#include "kernel/gather_last_tokens.h"
#include "kernel/layernorm.h"
#include "kernel/rmsnorm.h"
#include "layer/attention.h"
#include "layer/ffn.h"
#include "layer/gated_ffn.h"
#include "util/torch_utils.h"
#include "util/st_datatypes.h"
#include "util/nccl_utils.h"

namespace st::model {

// Gpt<T>::Gpt - Constructor
// Copies the hyper-parameter / paged-attention / parallelism structs into the
// model, derives per-rank parallelism fields from the hyper parameters, and
// sets up the weight holder. Weights themselves are filled later by
// loadWeight() or initDummyWeight().
template<typename T>
Gpt<T>::Gpt(const GptHyperParam& hyper_param, 
			const GptPagedAttnParam& pagedattn_param, 
			GptParallelismParam& parallelism_param 
		)
{
	this->hyper_param = hyper_param;
	this->pagedattn_param = pagedattn_param;
	this->parallelism_param = parallelism_param;
	// Derive parallelism fields (e.g. local layer count) from the hyper params.
	this->parallelism_param.init_by_hyper_param(hyper_param);
	// NOTE(review): `parallelism_param` here is the *argument* (it shadows the
	// member), i.e. weight.init sees the struct as it was BEFORE
	// init_by_hyper_param ran on the member copy — confirm this is intentional.
	this->weight.init(hyper_param, parallelism_param);
}

// Gpt<T>::~Gpt - Destructor
// Tears down the NCCL communicators created in init_communicator().
template<typename T>
Gpt<T>::~Gpt() {
	util::stNcclDestroy(tensor_para_comm);
	// Fix: the pipeline-parallel communicator is created in init_communicator()
	// alongside the tensor-parallel one, but was never destroyed (leak).
	util::stNcclDestroy(pipeline_para_comm);
}

// Replace the model's paged-attention configuration.
template<typename T>
void Gpt<T>::setPagedattnParam(const GptPagedAttnParam& new_pagedattn_param) {
	pagedattn_param = new_pagedattn_param;
}

// Replace the model's parallelism configuration.
template<typename T>
void Gpt<T>::setParallelismParam(const GptParallelismParam& new_parallelism_param){
	parallelism_param = new_parallelism_param;
}

// Create the NCCL communicators for tensor parallelism (tp_id) and pipeline
// parallelism (pp_id). The unique ids must be shared by every rank of the
// corresponding group before calling this.
template<typename T>
void Gpt<T>::init_communicator(const ncclUniqueId& tp_id, const ncclUniqueId& pp_id){
	const auto& pp = parallelism_param;

	// Every rank joins the tensor-parallel communicator (last flag always true).
	tensor_para_comm = util::stNcclInit(pp.tensor_para_size, pp.tensor_para_rank, tp_id, 0, true);

	// For the pipeline-parallel communicator the last flag is only true on the
	// tensor-parallel leader (tensor_para_rank == 0) — presumably only stage
	// leaders participate; confirm against stNcclInit's contract.
	pipeline_para_comm = util::stNcclInit(pp.pipeline_para_size, pp.pipeline_para_rank, pp_id, 0, pp.tensor_para_rank == 0);
}

// Gpt<T>::loadWeight - Load model weights from `model_path`.
// The on-disk format is the one produced by scripts/convert-XXX.py.
template<typename T>
void Gpt<T>::loadWeight(const std::string& model_path) {
	weight.loadWeight(model_path);
}

// Fill the model with dummy weights (no checkpoint required).
template<typename T>
void Gpt<T>::initDummyWeight() {
	weight.initDummyWeight();
}

// Generate d_position_ids
// Builds the absolute position id of every token and uploads it to the GPU
// buffer d_position_ids. Token j of request i sits at position
// first_token_indexes[i] + j in its sentence. d_position_ids is later consumed
// by forwardDecoder (RoPE) and inputBatchEmbedAndPosiEncode.
template<typename T>
void Gpt<T>::getInputPosiIds(
	const std::vector<std::vector<int64_t>> &input_tokens_batched,
	const std::vector<int64_t> &first_token_indexes,
	const int64_t num_tokens
) {
	// std::vector instead of raw new[]/delete[]: no leak if CUDA_CHECK throws,
	// and no int/int64_t loop-index mixing.
	std::vector<int64_t> h_position_ids;
	h_position_ids.reserve(num_tokens);
	for (size_t i = 0; i < first_token_indexes.size(); i++) {
		for (size_t j = 0; j < input_tokens_batched[i].size(); j++) {
			h_position_ids.push_back(first_token_indexes[i] + (int64_t)j);
		}
	}
	assert((int64_t)h_position_ids.size() == num_tokens);
	d_position_ids.remalloc(num_tokens);
	CUDA_CHECK(cudaMemcpy(d_position_ids.ptr, h_position_ids.data(), sizeof(int64_t) * num_tokens, cudaMemcpyHostToDevice));
}

// Gpt<T>::inputBatchEmbedAndPosiEncode - Embed && Positional encode a batch of tokens.
// The function flattens the tokens of batched requests into a 1-D array to do
// embedding, and then does positional encoding according to the position ids
// previously computed by getInputPosiIds (stored in d_position_ids). For
// example, if request i is in decoding phase and has already generated 5
// tokens, its first token has position 5.
template<typename T>
void Gpt<T>::inputBatchEmbedAndPosiEncode(
	T* d_output,
	const std::vector<std::vector<int64_t>> &input_tokens_batched,
	const int64_t num_tokens
) {
	// Flatten the batched token ids into one host array.
	// std::vector instead of raw new[]/delete[]: no leak if CUDA_CHECK throws.
	std::vector<int64_t> h_token_ids;
	h_token_ids.reserve(num_tokens);
	for (const auto& req_tokens : input_tokens_batched) {
		h_token_ids.insert(h_token_ids.end(), req_tokens.begin(), req_tokens.end());
	}
	assert((int64_t)h_token_ids.size() == num_tokens);

	// Copy token_ids to GPU (d_position_ids was already filled by getInputPosiIds)
	d_token_ids.remalloc(num_tokens);
	CUDA_CHECK(cudaMemcpy(d_token_ids.ptr, h_token_ids.data(), sizeof(int64_t) * num_tokens, cudaMemcpyHostToDevice));

	// Embedding & positional encoding. Under RoPE the absolute position
	// embedding table is skipped (nullptr) — rotary embedding is applied
	// inside the attention layer instead.
	st::kernel::embedAndPosiEncodeBatched(
		d_output,
		d_token_ids.ptr,
		d_position_ids.ptr,
		weight.embed_tokens_weight,
		this->hyper_param.is_rotary_posi_embedding ? nullptr : weight.embed_positions_weight,
		num_tokens,
		hyper_param.hidden_size
	);
	sync_check_cuda_error();
}

// Gpt<T>::selectOutputTokenBatched - Select the most possible token from the output of the
// decoder (the hidden state of the last token generated by the final layernorm layer)
// The function first gathers each request's last-token hidden state, then
// performs a GEMM against the output projection weight, and finally takes the
// argmax over the vocabulary (greedy selection).
template<typename T>
void Gpt<T>::selectOutputTokenBatched(
	int64_t* h_result_token,	// cpu, [batch_size]
	const T* d_input,		// [num_tokens, hidden_size]
	int64_t num_tokens,
	const int64_t* d_first_token_indexes,
	int64_t batch_size
) {
	output_projection_last_tokens_buf.remalloc(batch_size*hyper_param.hidden_size);
	output_projection_buf.remalloc(batch_size*hyper_param.vocab_size);
	output_projection_result_buf.remalloc(batch_size);

	// Step 0: Gather tokens.
	// Before this step, the layout of d_input looks like:
	// | Prompt or last token | Prompt or last token | ... | Prompt or last token |
	// If a request is in the context stage, then "Prompt or last token" contains (the length of) the prompt
	// number of tokens. Otherwise, if a request is in the decoding stage, then it contains the last token.
	// 
	// After this step, the layout looks like:
	// | last token | last token | ... | last token |
	//
	// The reason why we need this step is that only the last token of each
	// request needs to be decoded.
	const T* gathered_tokens;	// [batch_size, hidden_size]
	if (num_tokens != batch_size) {
		st::kernel::gatherLastTokens(
			output_projection_last_tokens_buf.ptr,
			d_input,
			num_tokens,
			batch_size,
			hyper_param.hidden_size,
			d_first_token_indexes
		);
		gathered_tokens = output_projection_last_tokens_buf.ptr;
		sync_check_cuda_error();
	} else {
		// Every request contributed exactly one token — no gathering required
		gathered_tokens = d_input;
	}

	// Step 1: GEMM — logits = gathered_tokens * output_proj_weight^T
	cublas_wrapper.gemm(
		CUBLAS_OP_N,
		CUBLAS_OP_T,
		batch_size,
		hyper_param.vocab_size,
		hyper_param.hidden_size,
		gathered_tokens,
		weight.output_proj_weight,
		output_projection_buf.ptr
	);
	sync_check_cuda_error();

	// Step 2: Findmax — greedy argmax over the vocabulary, per request
	st::kernel::findmaxBatched(
		output_projection_result_buf.ptr,
		output_projection_buf.ptr,
		batch_size,
		hyper_param.vocab_size
	);
	sync_check_cuda_error();

	// Step 3: Copy the result to CPU
	// Fix: this was the only CUDA call in the file not wrapped in CUDA_CHECK,
	// so a failing copy would go unnoticed.
	CUDA_CHECK(cudaMemcpy(
		h_result_token,
		output_projection_result_buf.ptr,
		batch_size * sizeof(int64_t),
		cudaMemcpyDeviceToHost
	));
	sync_check_cuda_error();
}

// Gpt<T>::forwardDecoder - Run forward propagation of the decoder
// The decoder is a stack of `local_layer_num` pre-layernorm transformer blocks.
// (The final layernorm is applied by the caller, Gpt<T>::forward.)
template<typename T>
void Gpt<T>::forwardDecoder(
	T* d_output,							// [num_tokens, hidden_size]
	const T* d_input,						// [num_tokens, hidden_size]
	T* d_k_cache,							// [num_blocks, num_layers, local_kv_head_num, block_size, head_dim]
	T* d_v_cache,							// [num_blocks, num_layers, local_kv_head_num, block_size, head_dim]
	int64_t* d_block_table,					// [batch_size, max_num_block_per_seq]
	const int64_t* d_input_len,				// [batch_size], gpu. The length of the i-th input. For context stage it is the number of tokens in user's request, while for decoder stage it is the number of previous tokens 

	const int64_t* h_input_len,			// [batch_size], cpu
	const bool* h_is_context_stage,	// [batch_size], cpu. Whether the i-th input is in context stage or regression stage
	const int64_t batch_size
) {
	if (hyper_param.is_pre_layernorm == false) {
		throw std::runtime_error("Gpt::forwardDecoder: is_pre_layernorm must be true");
	}

	// Calculate the number of tokens: a context-stage request contributes its
	// whole prompt, a decoding-stage request contributes exactly one token.
	int64_t num_tokens = 0;
	for (int64_t i = 0; i < batch_size; ++i) {
		num_tokens += h_is_context_stage[i] ? h_input_len[i] : 1;
	}

	// Per-tensor-parallel-rank head / FFN sizes
	const int64_t local_q_head_num = hyper_param.num_q_heads / parallelism_param.tensor_para_size;
	const int64_t local_kv_head_num = hyper_param.num_kv_heads / parallelism_param.tensor_para_size;
	const int64_t local_ffn_size = hyper_param.ffn_inter_dim / parallelism_param.tensor_para_size;

	// Remalloc buffers
	qkv_buf.remalloc((num_tokens+15) * (local_q_head_num+2*local_kv_head_num) * hyper_param.head_dim);	// We add a `+15` here because of a requirement proposed by `fused_context_stage_attention_tensor_core`
	attn_out_buf.remalloc(num_tokens * local_q_head_num * hyper_param.head_dim);
	ffn_inter_buf_1.remalloc(num_tokens * local_ffn_size);
	ffn_inter_buf_2.remalloc(num_tokens * local_ffn_size);
	attention_out.remalloc(num_tokens * hyper_param.hidden_size);
	context_stage_kernel_m_buf.remalloc(local_q_head_num * num_tokens);
	context_stage_kernel_l_buf.remalloc(local_q_head_num * num_tokens);

	ith_context_req_req_index.remalloc(batch_size);
	ith_context_req_token_index.remalloc(batch_size+1);
	ith_decoding_req_req_index.remalloc(batch_size);
	ith_decoding_req_token_index.remalloc(batch_size);

	// Calculate indexes of requests in context stage and regression stage
	// Will be used in the attention layer (fusedDecodingStageAttentionKernel and fusedContextStageAttentionKernel)
	// Fix: std::vector instead of raw new[]/delete[] — the raw arrays leaked
	// if any CUDA_CHECK below threw.
	int64_t num_context_reqs = 0, num_decoding_reqs = 0;
	std::vector<int64_t> ith_context_req_req_index_cpu(batch_size);
	std::vector<int32_t> ith_context_req_token_index_cpu(batch_size+1);
	std::vector<int64_t> ith_decoding_req_req_index_cpu(batch_size);
	std::vector<int64_t> ith_decoding_req_token_index_cpu(batch_size);
	int64_t max_context_req_len = 0, max_decoding_req_len = 0;
	for (int64_t i = 0, cur_token_index = 0; i < batch_size; ++i) {
		if (h_is_context_stage[i]) {
			ith_context_req_req_index_cpu[num_context_reqs] = i;
			ith_context_req_token_index_cpu[num_context_reqs] = cur_token_index;
			max_context_req_len = std::max(max_context_req_len, h_input_len[i]);
			num_context_reqs += 1;
			cur_token_index += h_input_len[i];
		} else {
			ith_decoding_req_req_index_cpu[num_decoding_reqs] = i;
			ith_decoding_req_token_index_cpu[num_decoding_reqs] = cur_token_index;
			max_decoding_req_len = std::max(max_decoding_req_len, h_input_len[i]);
			num_decoding_reqs += 1;
			cur_token_index += 1;
		}
	}
	ith_context_req_token_index_cpu[num_context_reqs] = num_tokens;

	// Copy the indexes to GPU
	// TODO These memcpys are blocking calls which forces a synchronization. Optimize it.
	CUDA_CHECK(cudaMemcpy(ith_context_req_req_index.ptr, ith_context_req_req_index_cpu.data(), sizeof(int64_t) * batch_size, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(ith_context_req_token_index.ptr, ith_context_req_token_index_cpu.data(), sizeof(int32_t) * (batch_size+1), cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(ith_decoding_req_req_index.ptr, ith_decoding_req_req_index_cpu.data(), sizeof(int64_t) * batch_size, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(ith_decoding_req_token_index.ptr, ith_decoding_req_token_index_cpu.data(), sizeof(int64_t) * batch_size, cudaMemcpyHostToDevice));

	assert(parallelism_param.hyper_inited);
	// Run the forward pass, layer by layer
	for (int64_t layer_id = 0; layer_id < parallelism_param.local_layer_num; ++layer_id) {
		const T* cur_layer_input = layer_id == 0 ? d_input : d_output;
		const GptLayerWeight<T>& cur_layer_weight = weight.layer_weights[layer_id];

		// Here, the `cur_layer_input` will be used again in Step 3 (AddResidual)

		// Step 1: Layernorm / RMSNorm
		// Input:
		//   cur_layer_input: [num_tokens, hidden_size]
		// Output:
		//   attention_out: [num_tokens, hidden_size]
		if (hyper_param.is_rmsnorm) {
			st::kernel::rmsnorm<T>(
				attention_out.ptr,
				cur_layer_input,
				cur_layer_weight.attn_layernorm_weight,
				weight.layernorm_epsilon,
				num_tokens,
				hyper_param.hidden_size
			);
		} else {
			st::kernel::layernorm<T>(
				attention_out.ptr,
				cur_layer_input,

				cur_layer_weight.attn_layernorm_weight,
				cur_layer_weight.attn_layernorm_bias,
				weight.layernorm_epsilon,

				num_tokens,
				hyper_param.hidden_size
			);
		}
		sync_check_cuda_error();

		// Step 2: Self-attention
		// Input:
		//   attention_out: [num_tokens, hidden_size]
		//   k_cache: [num_blocks, num_layers, local_kv_head_num, block_size, head_dim]
		//   v_cache: [num_blocks, num_layers, local_kv_head_num, block_size, head_dim]
		// Output:
		//   attention_out: [num_tokens, hidden_size]
		//	 k/v cache
		st::layer::attention<T>(
			attention_out.ptr,
			d_k_cache,
			d_v_cache,

			attention_out.ptr,
			d_input_len,
			h_is_context_stage,
			d_block_table,
			d_position_ids.ptr,

			num_context_reqs,
			num_decoding_reqs,
			ith_context_req_req_index.ptr,
			ith_context_req_token_index.ptr,
			ith_decoding_req_req_index.ptr,
			ith_decoding_req_token_index.ptr,
			max_context_req_len,
			max_decoding_req_len,

			cur_layer_weight.attn_qkv_kernel,
			hyper_param.is_attn_qkv_biased ? cur_layer_weight.attn_qkv_bias : nullptr,
			cur_layer_weight.attn_out_kernel,
			hyper_param.is_attn_out_biased ? cur_layer_weight.attn_out_bias : nullptr,

			batch_size,
			num_tokens,
			hyper_param.hidden_size,
			parallelism_param.local_layer_num,
			hyper_param.num_q_heads,
			hyper_param.num_kv_heads,
			hyper_param.head_dim,
			hyper_param.is_rotary_posi_embedding,
			layer_id,
			pagedattn_param.max_num_block_per_req,
			pagedattn_param.block_size,

			qkv_buf.ptr,
			attn_out_buf.ptr,
			context_stage_kernel_m_buf.ptr,
			context_stage_kernel_l_buf.ptr,

			cublas_wrapper,
			tensor_para_comm
		);
		sync_check_cuda_error();

		// Step 3: AddResidual & LayerNorm
		// Input:
		//   attention_out: [num_tokens, hidden_size]
		//   cur_layer_input: [num_tokens, hidden_size]
		// Output:
		//   attention_out: [num_tokens, hidden_size]
		//   output: [num_tokens, hidden_size]
		// Here, the `attention_out` will be used again in Step 5 (AddResidual)
		if (hyper_param.is_rmsnorm) {
			// rmsnorm has no fused residual path, so add the residual first
			st::kernel::addbias<T>(
				attention_out.ptr,
				attention_out.ptr,
				cur_layer_input,
				num_tokens*hyper_param.hidden_size
			);
			sync_check_cuda_error();
			st::kernel::rmsnorm<T>(
				d_output,
				attention_out.ptr,

				cur_layer_weight.final_layernorm_weight,
				weight.layernorm_epsilon,

				num_tokens,
				hyper_param.hidden_size
			);
			sync_check_cuda_error();
		} else {
			// We fuse "AddResidual" and "LayerNorm" together
			st::kernel::layernorm<T>(
				d_output,
				attention_out.ptr,

				cur_layer_weight.final_layernorm_weight,
				cur_layer_weight.final_layernorm_bias,
				weight.layernorm_epsilon,

				num_tokens,
				hyper_param.hidden_size,

				attention_out.ptr,	// biased_input
				cur_layer_input		// residual
			);
			sync_check_cuda_error();
		}

		// Step 4: Ffn (plain or gated, depending on the architecture)
		// Input:
		//   output: [num_tokens, hidden_size]
		// Output:
		//   output: [num_tokens, hidden_size]
		if (hyper_param.is_gated_ffn) {
			st::layer::gatedFfn<T>(
				d_output,
				d_output,

				cur_layer_weight.ffn_fc1_weight,
				cur_layer_weight.ffn_fc2_weight,
				cur_layer_weight.ffn_fc3_weight,

				num_tokens,
				hyper_param.hidden_size,
				hyper_param.ffn_inter_dim,
				hyper_param.hidden_size,
				this->hyper_param.ffn_activation_type,

				ffn_inter_buf_1.ptr,
				ffn_inter_buf_2.ptr,

				cublas_wrapper,
				tensor_para_comm
			);
		} else {
			st::layer::ffn<T>(
				d_output,
				d_output,

				cur_layer_weight.ffn_fc1_weight,
				cur_layer_weight.ffn_fc1_bias,
				cur_layer_weight.ffn_fc2_weight,
				cur_layer_weight.ffn_fc2_bias,

				num_tokens,
				hyper_param.hidden_size,
				hyper_param.ffn_inter_dim,
				hyper_param.hidden_size,
				this->hyper_param.ffn_activation_type,

				ffn_inter_buf_1.ptr,

				cublas_wrapper,
				tensor_para_comm
			);
		}
		sync_check_cuda_error();

		// Step 5: AddResidual
		// Input:
		//   output: [num_tokens, hidden_size]
		//   attention_out: [num_tokens, hidden_size]
		// Output:
		//   output: [num_tokens, hidden_size]
		st::kernel::addbias<T>(
			d_output,
			d_output,
			attention_out.ptr,
			num_tokens*hyper_param.hidden_size
		);
		sync_check_cuda_error();
	}
}

// Gpt<T>::forward - Run one full forward step over a batch of requests and
// return one greedily-sampled token per request.
// first_token_indexes[i] == 0 means request i is in the context (prompt)
// stage; otherwise it is in the decoding stage and carries its new token(s).
// On non-first pipeline stages the input activations are received over NCCL;
// on non-last stages the output activations are sent downstream and the
// returned token vector is left zero-initialized.
template<typename T>
std::vector<int64_t> Gpt<T>::forward(
	const std::vector<std::vector<int64_t>> &input_tokens_batched,
	const std::vector<int64_t> &first_token_indexes,
	void* d_k_cache,
	void* d_v_cache,
	int64_t* d_block_table
) {
	const int64_t batch_size = input_tokens_batched.size();
	assert(first_token_indexes.size() == static_cast<size_t>(batch_size));

	// Prepare input lengths, stage flags, prefix sums and the token count.
	// Fix: std::vector / unique_ptr instead of raw new[]/delete[] — the raw
	// arrays leaked on any exception path below.
	std::vector<int64_t> h_input_lens(batch_size);
	std::vector<int64_t> h_sum_prev_input_lens(batch_size);
	// bool[] kept as an owned raw array because forwardDecoder wants const bool*
	// (std::vector<bool> is bit-packed and has no .data()).
	auto h_is_context_stage = std::make_unique<bool[]>(batch_size);
	int64_t num_tokens = 0;
	for (int64_t i = 0; i < batch_size; i++) {
		// input_lens in decoding stage kernel DOES NOT contain the new token(s)
		h_is_context_stage[i] = first_token_indexes[i] == 0;
		h_input_lens[i] = h_is_context_stage[i] ? (int64_t)input_tokens_batched[i].size() : first_token_indexes[i];
		h_sum_prev_input_lens[i] = num_tokens;
		num_tokens += input_tokens_batched[i].size();
		const int64_t last_token_position = h_is_context_stage[i] ? h_input_lens[i]-1 : first_token_indexes[i];
		if (last_token_position >= hyper_param.max_position_embeddings) {
			// Fix: %ld with int64_t is UB where long is 32-bit; use %lld + cast.
			fprintf(stderr, "Gpt::forward: input length (%s, last_token_position=%lld) exceeds max_position_embeddings (%lld)\n",
					h_is_context_stage[i] ? "context" : "decoding",
					(long long)last_token_position,
					(long long)hyper_param.max_position_embeddings);
			assert(0);
		}
	}
	d_input_lens.remalloc(batch_size);
	d_sum_prev_input_lens.remalloc(batch_size);
	CUDA_CHECK(cudaMemcpy(d_input_lens.ptr, h_input_lens.data(), sizeof(int64_t) * batch_size, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(d_sum_prev_input_lens.ptr, h_sum_prev_input_lens.data(), sizeof(int64_t)*batch_size, cudaMemcpyHostToDevice));
	sync_check_cuda_error();

	// Prepare d_decoder_input, d_decoder_output.
	d_decoder_input.remalloc(num_tokens * this->hyper_param.hidden_size);
	d_decoder_output.remalloc(num_tokens * this->hyper_param.hidden_size);
	sync_check_cuda_error();

	// Position ids are needed on the first stage (absolute position embedding)
	// and on every stage when RoPE is used (applied inside attention).
	if (parallelism_param.is_first_stage() || hyper_param.is_rotary_posi_embedding) {
		this->getInputPosiIds(input_tokens_batched, first_token_indexes, num_tokens);
		sync_check_cuda_error();
	}

	if (parallelism_param.is_first_stage()) {
		// Input embedding & positional encoding
		// WARN the variable d_position_ids calculated by getInputPosiIds will be used in forwardDecoder
		this->inputBatchEmbedAndPosiEncode(
			d_decoder_input.ptr,
			input_tokens_batched,
			num_tokens
		);
		sync_check_cuda_error();
	}
	else {
		// Non-first stage: the stage leader receives the activations from the
		// previous pipeline stage, then broadcasts them within the TP group.
		if (parallelism_param.is_stage_leader()){
			st::util::stNcclRecv(
				d_decoder_input.ptr,
				num_tokens * hyper_param.hidden_size, 
				util::stGetDataType<T>().get_nccl_type(), 
				parallelism_param.pipeline_para_rank-1, 
				pipeline_para_comm, 
				pipeline_para_comm.stream
			);
		}
		if (parallelism_param.tensor_para_size > 1){
			st::util::stNcclBcast(
				d_decoder_input.ptr,
				num_tokens * hyper_param.hidden_size, 
				util::stGetDataType<T>().get_nccl_type(),
				0, 
				tensor_para_comm, 
				tensor_para_comm.stream
			);
		}
	}

	// forward pass through this stage's transformer blocks
	this->forwardDecoder(
		d_decoder_output.ptr,
		d_decoder_input.ptr,
		(T*)d_k_cache,
		(T*)d_v_cache,
		d_block_table,
		d_input_lens.ptr,
		h_input_lens.data(),
		h_is_context_stage.get(),
		batch_size
	);
	sync_check_cuda_error();

	std::vector<int64_t> output_tokens(batch_size);

	if (parallelism_param.is_last_stage()){
		// Final Layernorm / RMSNorm
		if (hyper_param.is_rmsnorm) {
			st::kernel::rmsnorm<T>(
				d_decoder_output.ptr,
				d_decoder_output.ptr,

				weight.final_layernorm_weight,
				weight.layernorm_epsilon,

				num_tokens,
				hyper_param.hidden_size
			);
		} else {
			st::kernel::layernorm<T>(
				d_decoder_output.ptr,
				d_decoder_output.ptr,

				weight.final_layernorm_weight,
				weight.final_layernorm_bias,
				weight.layernorm_epsilon,

				num_tokens,
				hyper_param.hidden_size
			);
		}
		sync_check_cuda_error();

		// sampling
		// TODO (yinmin): add more sampling variants except for greedy sampling
		this->selectOutputTokenBatched(
			output_tokens.data(),
			d_decoder_output.ptr,
			num_tokens,
			d_sum_prev_input_lens.ptr,
			batch_size
		);
		sync_check_cuda_error();
	} 
	else {
		// Non-last stage: the stage leader forwards the activations downstream.
		if (parallelism_param.is_stage_leader()){
			st::util::stNcclSend(
				d_decoder_output.ptr,
				num_tokens * hyper_param.hidden_size, 
				util::stGetDataType<T>().get_nccl_type(), 
				parallelism_param.pipeline_para_rank+1, 
				pipeline_para_comm, 
				pipeline_para_comm.stream
			);
		}
	}

	return output_tokens;
}

// Explicit instantiations: the model is compiled for fp32 and fp16 (CUDA half).
template class Gpt<float>;
template class Gpt<half>;

} // namespace st::model

